library(tidyverse)
library(skimr)
library(stringr)
library(ggpubr)
library(flextable)
library(readr)
library(knitr)
library(naniar)
library(plotly)
games <- read.csv("/Users/williamgifford/Final Project Submission/Data/games.csv")
player_play <- read.csv("/Users/williamgifford/Final Project Submission/Data/player_play.csv")
players <- read.csv("/Users/williamgifford/Final Project Submission/Data/players.csv")
plays <- read.csv("/Users/williamgifford/Final Project Submission/Data/plays.csv")

Data Dictionaries

Table: Data Dictionary for games.csv
Variable Description
gameId Game identifier, unique (numeric)
season Season of game (numeric)
week Week of game (numeric)
gameDate Game Date (time, mm/dd/yyyy)
gameTimeEastern Start time of game (time, HH:MM:SS, EST)
homeTeamAbbr Home team three-letter code (text)
visitorTeamAbbr Visiting team three-letter code (text)
homeFinalScore The total amount of points scored by the home team in the game (numeric)
visitorFinalScore The total amount of points scored by the visiting team in the game (numeric)
Table: Data Dictionary for players.csv
Variable Description
nflId Player identification number, unique across players (numeric)
height Player height (text)
weight Player weight (numeric)
birthDate Date of birth (YYYY-MM-DD)
collegeName Player college (text)
position Official player position (text)
displayName Player name (text)
Table: Data Dictionary for plays.csv
Variable Description
gameId Game identifier, unique (numeric)
playId Play identifier, not unique across games (numeric)
playDescription Description of play (text)
quarter Game quarter (numeric)
down Down (numeric)
yardsToGo Distance needed for a first down (numeric)
possessionTeam Team abbr of team on offense with possession of ball (text)
defensiveTeam Team abbr of team on defense (text)
yardlineSide 3-letter team code corresponding to line-of-scrimmage (text)
yardlineNumber Yard line at line-of-scrimmage (numeric)
gameClock Time on clock of play (MM:SS)
preSnapHomeScore Home score prior to the play (numeric)
preSnapVisitorScore Visiting team score prior to the play (numeric)
playNullifiedByPenalty Whether or not an accepted penalty on the play cancels the play outcome (text)
absoluteYardlineNumber Distance from end zone for possession team (numeric)
preSnapHomeTeamWinProbability The win probability of the home team before the play (numeric)
preSnapVisitorTeamWinProbability The win probability of the visiting team before the play (numeric)
expectedPoints Expected points on this play (numeric)
offenseFormation Formation used by possession team (text)
receiverAlignment Enumerated as 0x0, 1x0, 1x1, 2x0, 2x1, 2x2, 3x0, 3x1, 3x2 (text)
playClockAtSnap What the play clock value was at time of snap (numeric)
passResult Dropback outcome of the play (text)
passLength The distance beyond the LOS that the ball traveled not including yards into the endzone. If thrown behind LOS, the value is negative. (numeric)
targetX The x-coordinate of the targeted receiver when the pass arrived (numeric)
targetY The y-coordinate of the targeted receiver when the pass arrived (numeric)
playAction Boolean indicating whether there was play-action on the play (Boolean)
dropbackType The type of drop back after the snap by the QB (text)
dropbackDistance The distance the QB dropped back (yards) behind the center after the snap (numeric)
passLocationType The location type of where the QB was at the time of throw (text)
timeToThrow The time (secs) elapsed between snap and pass (numeric)
timeInTackleBox The amount of time the QB spent inside the tackle box (numeric)
timeToSack The time from snap to the time the QB was sacked (numeric)
passTippedAtLine Boolean indicating whether the pass was tipped at the line of scrimmage (Boolean)
unblockedPressure Boolean indicating whether there was pressure from an unblocked player (Boolean)
qbSpike Boolean indicating whether the play was a QB Spike (Boolean)
qbKneel Whether or not the play was a QB Kneel (numeric)
qbSneak Whether or not the play was a QB Sneak (numeric)
rushLocationType The direction the runner ran based on where the offensive linemen were during the play (text)
penaltyYards Yards gained by offense by penalty (numeric)
prePenaltyYardsGained Net yards gained by the offense, before penalty yardage (numeric)
yardsGained Net yards gained by the offense, including penalty yardage (numeric)
homeTeamWinProbabilityAdded Win probability delta for home team (numeric)
visitorTeamWinProbabilityAdded Win probability delta for visitor team (numeric)
expectedPointsAdded Expected points added on this play (numeric)
isDropback Boolean indicating whether the QB dropped back, meaning the play resulted in a pass, sack, or scramble (Boolean)
pff_runConceptPrimary The primary run concept on the play (text)
pff_runConceptSecondary The secondary run concept on the play (text)
pff_runPassOption Whether or not the play was a run-pass option (numeric)
pff_passCoverage The pass coverage concept employed by the defense on the play (text)
pff_manZone Whether the defense employed man or zone coverage on the play (text)
Table: Data Dictionary for player_play.csv
Variable Description
gameId Game identifier, unique (numeric)
playId Play identifier, not unique across games (numeric)
nflId Player identification number, unique across players (numeric)
teamAbbr The team abbreviation for the team the player plays for (text)
hadRushAttempt Whether or not the player had a rushing attempt on this play (numeric)
rushingYards The rush yards accrued by the player on this play (numeric)
hadDropback Whether or not the player dropped back on this play (numeric)
passingYards The pass yards accrued by the player on this play (numeric)
sackYardsOffense The yards lost by the player via a sack on this play (numeric)
hadPassReception Whether or not the player caught a pass on this play (numeric)
receivingYards The receiving yards accrued by the player on this play (numeric)
wasTargettedReceiver Whether or not the player was the intended receiver on this play (numeric)
yardageGainedAfterTheCatch The yards gained after the catch was made by the player on this play (numeric)
fumbles The number of fumbles by the player on this play (numeric)
fumbleLost Whether or not the player lost a fumble to the opposing team on this play (numeric)
fumbleOutOfBounds Whether or not the player fumbled the ball out of bounds on this play (numeric)
assistedTackle Whether or not the player required an assist to make a tackle on this play (numeric)
forcedFumbleAsDefense Whether or not the player forced a fumble by the opposing team on this play (numeric)
halfSackYardsAsDefense The yards conceded by the offense because of a half-sack by the player on this play (numeric)
passDefensed Whether or not a passing play was stopped by the player on this play (numeric)
quarterbackHit Whether or not the player recorded a QB hit on this play (numeric)
sackYardsAsDefense The yards conceded by the offense because of a sack by the player on this play (numeric)
safetyAsDefense Whether or not the player forced a safety on this play (numeric)
soloTackle Whether or not the player recorded a solo tackle on this play (numeric)
tackleAssist Whether or not the player was awarded an assisted tackle on this play (numeric)
tackleForALoss Whether or not the player recorded a tackle behind the line of scrimmage on this play (numeric)
tackleForALossYardage The yards conceded by the offense because of a tackle behind the line of scrimmage by the player on this play (numeric)
hadInterception Whether or not the player intercepted a pass on this play (numeric)
interceptionYards The yards returned by the player on an intercepted pass on this play (numeric)
fumbleRecoveries The number of fumbles recovered by the player on this play (numeric)
fumbleRecoveryYards The yards returned by the player on a fumble recovery on this play (numeric)
wasInitialPassRusher Whether or not the player was the initial pass rusher on this play (numeric)
penaltyNames The names of all the penalties that were called on this player on this play (text)
causedPressure Boolean indicating whether the player pressured the QB, defined as achieving a peak pressure probability greater than or equal to 0.75 over the course of a dropback (Boolean)
timeToPressureAsPassRusher The time elapsed from snap to the first instance of this player reaching a pressure probability greater than or equal to 0.75 (numeric)
getOffAsPassRusher The time it took for this player to cross the line of scrimmage as a pass rusher after the ball was snapped (numeric)
inMotionAtBallSnap Boolean indicating whether the player was in motion at snap (Boolean)
shiftSinceLineset Boolean indicating whether the player shifted since the lineset (Boolean)
motionSinceLineset Boolean indicating whether the player went in motion after they were initially set at the line on this play (Boolean)
wasRunningRoute Boolean indicating if the player was running a route on this play (Boolean)
routeRan The name of the route ran by the player on this play (text)
blockedPlayerNFLId1 The NFL player ID of the primary opponent being blocked by the player on this play (numeric)
blockedPlayerNFLId2 The NFL player ID of the secondary opponent being blocked by the player on this play (numeric)
blockedPlayerNFLId3 The NFL player ID of the tertiary opponent being blocked by the player on this play (numeric)
pressureAllowedAsBlocker Whether or not any of the pass rushers that the blocker had a true matchup against recorded a pressure on this play (numeric)
timeToPressureAllowedAsBlocker The time elapsed from snap to the first instance of a pass rusher who the blocker had a true matchup against achieving a pressure probability above 0.75 on this play (numeric)
pff_defensiveCoverageAssignment The specific defensive coverage assignment given to the player on this play (text)
pff_primaryDefensiveCoverageMatchupNflId The NFL player ID of the opponent tagged as the primary matchup in coverage for the defender on this play (numeric)
pff_secondaryDefensiveCoverageMatchupNflId The NFL player ID of the opponent tagged as the secondary matchup in coverage for the defender on this play (numeric)

Missingness Visualizations and Data Cleaning

player_play_missing <- player_play |>
  summarise(across(everything(), ~mean(is.na(.)) * 100)) |>
  pivot_longer(everything(), names_to = "variable", values_to = "missing_percent") |>
  filter(missing_percent > 0) |>
  arrange(desc(missing_percent))

player_play_missing |>
  ggplot(aes(x = reorder(variable, -missing_percent), y = missing_percent)) +
  geom_bar(stat = "identity", fill = "#013369") +
  coord_flip() +
  labs(title = "Missingness in `player_play` Dataset",
       x = "Variable", y = "Percent Missing") +
  theme_minimal()

These missing pieces are largely related to what these variables are signifying. For instance the timeToPresssureAsPassRusher only applies to the offensive line positions that are blocking. Thus the main pattern of missingness here is position dependent, and with a data set featuring every position on the field for every play, its clear why there are many vars with high missing rates.

play_missing <- plays |>
  summarise(across(everything(), ~mean(is.na(.)) * 100)) |>
  pivot_longer(everything(), names_to = "variable", values_to = "missing_percent") |>
  filter(missing_percent > 0) |>
  arrange(desc(missing_percent))

play_missing |>
  ggplot(aes(x = reorder(variable, -missing_percent), y = missing_percent)) +
  geom_bar(stat = "identity", fill = "#013369") +
  coord_flip() +
  labs(title = "Missingness in `plays` Dataset",
       x = "Variable", y = "Percent Missing") +
  theme_minimal()

The main pattern of missingness in this dataset comes from the type of play ran, when the play is a pass, all of the rush related variables are missing and vice versa.

players_missing <- players |>
  summarise(across(everything(), ~mean(is.na(.)) * 100)) |>
  pivot_longer(everything(), names_to = "variable", values_to = "missing_percent") |>
  filter(missing_percent > 0) |>
  arrange(desc(missing_percent))

players_missing |>
  ggplot(aes(x = reorder(variable, -missing_percent), y = missing_percent)) +
  geom_bar(stat = "identity", fill = "#013369") +
  coord_flip() +
  labs(title = "Missingness in players Dataset",
       x = "Variable", y = "Percent Missing") +
  theme_minimal()

games_missing <- games |>
  summarise(across(everything(), ~mean(is.na(.)) * 100)) |>
  pivot_longer(everything(), names_to = "variable", values_to = "missing_percent") |>
  filter(missing_percent > 0) |>
  arrange(desc(missing_percent))

games_missing |>
  ggplot(aes(x = reorder(variable, -missing_percent), y = missing_percent)) +
  geom_bar(stat = "identity", fill = "#013369") +
  coord_flip() +
  labs(title = "Missingness in games Dataset",
       x = "Variable", y = "Percent Missing") +
  theme_minimal()

No missing vars in the games data set

Overall I’m not going to do anything in order to handle the missing data, because the missing observations are context dependent to their variable.

String Manipulation

Expand Dropback Outcomes

plays <- plays |>
  mutate(passResultFull = case_when(
    passResult == "C" ~ "Complete pass",
    passResult == "I" ~ "Incomplete pass",
    passResult == "S" ~ "Quarterback sack",
    passResult == "IN" ~ "Intercepted pass",
    passResult == "R" ~ "Scramble",
    TRUE ~ "Unknown"
  ))

plays|>
  group_by(passResultFull) |>
  summarise(Count = n(), .groups = "drop")
## # A tibble: 6 × 2
##   passResultFull   Count
##   <chr>            <int>
## 1 Complete pass     5624
## 2 Incomplete pass   2911
## 3 Intercepted pass   193
## 4 Quarterback sack   608
## 5 Scramble           400
## 6 Unknown           6388

Conference Labeling for Players

add_conference <- function(college_name) {
  case_when(
    str_detect(college_name, "Alabama|Auburn|Georgia|Florida|LSU|Tennessee|Texas A&M|Vanderbilt|Arkansas|Mississippi|South Carolina|Oklahoma|Texas") ~ "SEC",
    str_detect(college_name, "Michigan|Ohio State|Penn State|Wisconsin|Iowa|Nebraska|Northwestern|Illinois|Indiana|Purdue|Minnesota|Rutgers|Maryland|Oregon|Washington|Southern California|UCLA") ~ "Big Ten",
    str_detect(college_name, "Clemson|Miami|Florida State|North Carolina|Virginia Tech|Pittsburgh|Duke|Georgia Tech|Louisville|NC State|Syracuse|Wake Forest|Stanford|California|Southern Methodist|Virginia") ~ "ACC",
    str_detect(college_name, "Baylor|Texas Christian|Texas Tech|Oklahoma State|Iowa State|West Virginia|Kansas|Kansas State|Cincinnati|Central Florida|Brigham Young|Houston|Colorado|Arizona|Arizona State|Utah") ~ "Big 12",
    TRUE ~ "Other"
  )
}

players <- players |>
  mutate(Conference = add_conference(collegeName))

conference_summary <- players |>
  group_by(Conference) |>
  summarize(PlayerCount = n(), .groups = "drop")

conference_summary
## # A tibble: 5 × 2
##   Conference PlayerCount
##   <chr>            <int>
## 1 ACC                233
## 2 Big 12             123
## 3 Big Ten            443
## 4 Other              411
## 5 SEC                487

Exploratory Data Analysis

Tables of Summary Statistics and Merged Data

Joined Data Analyzing RB Routes

position_route_table <- player_play |>
  inner_join(players, by = "nflId") |>
  filter(position %in% c("RB", "TE", "WR"), !is.na(routeRan)) |>
  count(position, routeRan, name = "route_count") |>
  group_by(position) |>
  mutate(
    route_percentage = round((route_count / sum(route_count)) * 100, 2)
  ) |>
  arrange(position, desc(route_count)) |>
  ungroup()

position_route_table <- position_route_table |>
  select(position, routeRan, route_count, route_percentage)

position_route_table_flextable <- flextable(position_route_table)

position_route_table_flextable

position

routeRan

route_count

route_percentage

RB

FLAT

3,693

48.55

RB

ANGLE

1,618

21.27

RB

OUT

740

9.73

RB

SCREEN

591

7.77

RB

GO

310

4.08

RB

HITCH

207

2.72

RB

WHEEL

172

2.26

RB

CROSS

116

1.52

RB

SLANT

90

1.18

RB

IN

51

0.67

RB

POST

16

0.21

RB

CORNER

3

0.04

TE

HITCH

1,735

18.12

TE

FLAT

1,437

15.01

TE

CROSS

1,305

13.63

TE

GO

1,300

13.58

TE

OUT

1,062

11.09

TE

IN

715

7.47

TE

POST

569

5.94

TE

CORNER

555

5.80

TE

SLANT

468

4.89

TE

SCREEN

381

3.98

TE

ANGLE

37

0.39

TE

WHEEL

11

0.11

WR

GO

6,139

24.51

WR

HITCH

4,312

17.22

WR

CROSS

2,719

10.86

WR

IN

2,595

10.36

WR

OUT

2,438

9.73

WR

POST

2,110

8.42

WR

SLANT

1,731

6.91

WR

CORNER

1,272

5.08

WR

FLAT

869

3.47

WR

SCREEN

803

3.21

WR

ANGLE

37

0.15

WR

WHEEL

20

0.08

Analyzing the Type of Defenses Teams Run in AFC

afc_teams <- c("BAL", "BUF", "CIN", "CLE", "DEN", "HOU", "IND", "JAX", 
               "KC", "LV", "LAC", "MIA", "NE", "NYJ", "PIT", "TEN")

afc_games <- games |>
  filter(homeTeamAbbr %in% afc_teams | visitorTeamAbbr %in% afc_teams)

afc_defensive_plays <- plays |>
  filter(defensiveTeam %in% afc_teams)

defense_summary <- afc_defensive_plays |>
  group_by(defensiveTeam, pff_passCoverage) |>
  summarise(Count = n(), .groups = "drop")

defense_percentage <- defense_summary |>
  group_by(defensiveTeam) |>
  mutate(Percent = round(Count / sum(Count) * 100, 2)) |>
  ungroup()

defense_wide <- defense_percentage |>
  select(defensiveTeam, pff_passCoverage, Percent) |>
  pivot_wider(names_from = pff_passCoverage, values_from = Percent, values_fill = 0)

defense_table <- defense_wide |>
  flextable() |>
  set_header_labels(defensiveTeam = "Team") |>
  set_caption("AFC Teams: Defensive Coverage Percentage Breakdown")|>
  theme_vanilla() |>
  autofit()

defense_table
AFC Teams: Defensive Coverage Percentage Breakdown

Team

2-Man

Bracket

Cover 6-Left

Cover-0

Cover-1

Cover-1 Double

Cover-2

Cover-3

Cover-3 Seam

Cover-6 Right

Goal Line

Prevent

Quarters

Red Zone

NA

Cover-3 Cloud Left

Cover-3 Cloud Right

Miscellaneous

Cover-3 Double Cloud

BAL

0.56

1.13

5.44

7.13

16.32

0.38

9.94

28.52

4.13

6.38

1.69

1.31

13.13

3.00

0.94

0.00

0.00

0.00

0.00

BUF

2.90

0.22

6.24

2.23

20.04

0.00

18.93

21.60

0.67

5.79

0.22

0.00

16.26

4.23

0.22

0.22

0.22

0.00

0.00

CIN

0.75

0.56

4.31

7.49

22.28

0.75

13.11

26.78

3.37

2.06

1.87

0.19

13.30

1.87

1.12

0.00

0.00

0.19

0.00

CLE

1.54

0.00

3.73

3.73

18.20

0.00

6.58

36.84

3.51

2.19

1.75

0.44

15.35

3.95

1.97

0.00

0.00

0.00

0.22

DEN

0.61

0.00

8.59

1.43

13.70

0.20

1.23

38.65

10.02

9.00

1.64

0.20

11.04

2.25

1.23

0.00

0.20

0.00

0.00

HOU

0.00

0.00

1.23

0.61

21.68

0.00

24.34

29.86

2.66

1.84

0.61

0.41

9.41

4.50

1.64

0.61

0.41

0.20

0.00

IND

0.00

0.19

1.90

0.95

16.76

0.00

5.71

51.24

3.62

0.76

0.19

0.19

11.05

5.52

1.90

0.00

0.00

0.00

0.00

JAX

2.26

0.00

4.14

1.32

22.60

0.38

13.94

31.45

5.65

4.71

1.51

0.38

6.78

4.14

0.75

0.00

0.00

0.00

0.00

KC

3.43

1.21

4.23

6.85

20.16

1.21

23.39

11.29

1.81

4.84

2.02

0.00

16.94

1.01

1.01

0.00

0.60

0.00

0.00

LAC

0.85

0.85

5.98

3.85

21.79

0.43

4.06

24.57

8.33

8.12

2.78

0.00

11.97

4.91

1.50

0.00

0.00

0.00

0.00

LV

0.40

0.61

3.24

6.68

21.05

0.61

15.99

28.34

0.40

2.83

0.61

0.00

14.37

1.82

1.01

1.01

1.01

0.00

0.00

MIA

1.88

0.94

0.38

9.38

28.52

0.38

13.32

36.59

0.38

0.38

1.88

0.75

1.31

2.44

1.31

0.19

0.00

0.00

0.00

NE

0.36

0.91

1.27

2.54

33.94

0.36

9.62

35.75

1.09

1.27

0.36

0.36

8.53

2.00

1.09

0.18

0.18

0.18

0.00

NYJ

0.72

0.18

6.09

2.51

24.19

0.00

1.97

27.42

0.72

5.91

0.18

0.54

26.16

1.97

1.08

0.18

0.18

0.00

0.00

PIT

0.79

0.79

3.54

2.55

27.11

1.77

22.59

31.63

0.79

2.36

0.79

0.00

2.55

1.38

1.38

0.00

0.00

0.00

0.00

TEN

0.83

2.07

1.04

2.07

19.46

0.41

16.36

23.81

0.41

3.31

0.21

0.41

23.60

5.18

0.41

0.00

0.00

0.00

0.41

NFC Rush Yards Summary

nfc_teams <- c("ARI", "ATL", "CAR", "CHI", "DAL", "DET", "GB", "LAR", 
               "MIN", "NO", "NYG", "PHI", "SEA", "SF", "TB", "WAS")

nfc_rush_plays <- plays |>
  filter(possessionTeam %in% nfc_teams & !is.na(rushLocationType) 
         & is.na(penaltyYards))
rush_summary <- nfc_rush_plays |>
  group_by(possessionTeam, rushLocationType) |>
  summarize(AverageRushYards = round(mean(yardsGained, na.rm = TRUE), 2), .groups = "drop")

rush_summary_wide <- rush_summary |>
  pivot_wider(names_from = rushLocationType, values_from = AverageRushYards, values_fill = 0)

rush_table <- rush_summary_wide |>
  flextable() |>
  set_header_labels(possessionTeam = "Team") |>
  set_caption("NFC Teams: Average Rush Yards by Rush Location") |>
  theme_vanilla() |>
  autofit()

rush_table
NFC Teams: Average Rush Yards by Rush Location

Team

INSIDE_LEFT

INSIDE_RIGHT

OUTSIDE_LEFT

OUTSIDE_RIGHT

UNKNOWN

ARI

5.29

5.25

3.10

3.14

-1.00

ATL

4.41

4.82

5.12

4.87

-1.00

CAR

5.14

4.55

4.94

4.60

-1.00

CHI

5.68

6.32

5.22

4.45

-0.60

DAL

6.68

4.33

4.81

4.17

-1.00

DET

6.25

4.84

4.60

4.10

-1.00

GB

7.00

4.15

5.00

3.51

-1.00

MIN

5.47

2.85

4.61

4.69

-2.50

NO

5.52

4.87

4.52

5.69

-1.00

NYG

5.23

6.06

5.32

3.47

-1.33

PHI

5.50

4.63

2.90

3.90

-1.11

SEA

4.21

6.52

1.90

7.45

-1.00

SF

4.34

5.17

4.50

3.87

-1.00

TB

3.98

3.28

2.34

2.19

-0.83

WAS

4.18

4.17

4.60

4.23

-0.75

Data Visualizations

Throwing Heat Map of Quarterbacks with more than 50 dropbacks

filtered_plays <- plays |>
  filter(!is.na(targetX) & !is.na(targetY))

filtered_plays <- filtered_plays |>
  mutate(
    X_bin = cut(targetX, breaks = seq(0, 120, by = 1), include.lowest = TRUE, labels = FALSE), 
    Y_bin = cut(targetY, breaks = seq(0, 53.3, by = 1), include.lowest = TRUE, labels = FALSE)
  )

zone_counts <- filtered_plays |>
  group_by(X_bin, Y_bin) |>
  summarize(Count = n(), .groups = "drop")

zone_counts <- complete(zone_counts, X_bin = 1:120, Y_bin = 1:53, fill = list(Count = 0))

ggplot(zone_counts, aes(x = X_bin, y = Y_bin, fill = Count)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "blue", high = "red") +
  geom_vline(xintercept = seq(0, 120, by = 10), color = "white", linetype = "dashed", size = 0.5) +
  geom_vline(xintercept = c(10, 110), color = "black", linetype = "dashed", size = 1) +
  theme_minimal() +
  theme(
    panel.background = element_rect(fill = "green4", color = "green4"),
    panel.grid = element_blank(),
    axis.text = element_text(color = "black", size = 8, face = "bold"),
    axis.title = element_text(color = "black", size = 10, face = "bold"),
    plot.title = element_text(color = "black", size = 12, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.ticks = element_line(color = "white")
  ) +
  labs(
    title = "Heatmap of Passing Target Locations (All Quarterbacks)",
    x = "Field Length (Yards)",
    y = "Field Width (Yards)",
    fill = "Number of Throws"
  ) +
  coord_fixed()

Here we have a football field broken up into 1x1-yard bins that count the number of throws targeted in that area. Some takeaways from this show that between the 30-yard lines there are the most throws generally, which makes sense given that is where most of the game takes place. But speaking from a width perspective, it’s clear that along the hash marks to the sidelines are a much more prevalent target area for quarterbacks compared to the middle of the field. Another observation is that the throws into the end zone are hot around the corners, with the middle seeing far fewer targets overall.

PS I endlessly struggled to get the 50 yard line as the center of the chart but things seemed to fall apart in other places when scaling the x axis, and this was the best I could do.

EPA Per Play for AFC West by Week

plays_with_week <- plays |>
  inner_join(games |> 
               select(gameId, week), by = "gameId")

afc_west_teams <- c("DEN", "KC", "LV", "LAC") 
afc_west_plays <- plays_with_week |>
  filter(possessionTeam %in% afc_west_teams & !is.na(expectedPointsAdded))

afc_west_epa <- afc_west_plays |>
  group_by(possessionTeam, week) |>
  summarize(AverageEPA = mean(expectedPointsAdded, na.rm = TRUE), .groups = "drop")

ggplot(afc_west_epa, aes(x = week, y = AverageEPA, color = possessionTeam, group = possessionTeam)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(
    title = "Average EPA Per Play by Game for AFC West Teams",
    x = "Week",
    y = "Average Expected Points Added (EPA)",
    color = "Team"
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = 1:9) + 
  scale_color_manual(values = c("#FB4F14", "#E31837", "#0080C6", "#A5ACAF"))

The Best way to look at this graph is to compare it against the W/L record for each team through 9 weeks in 2022. The Broncos at that time had a 3-5 record, Raiders 2-6, Chargers 5-3, and Chiefs 6-2. By looking at the EPA per play on a week by week basis it would track that the Broncos and Raiders would be at the bottom, although somewhat surprisingly the Broncos were able to eek out one more win in the 9 weeks with their failure of an offense. The Chiefs having 0 weeks with negative EPA/Play is illustrated by the best record of the bunch.

Yards Gained by Coverage Type

nfc_north_teams <- c("GB", "MIN", "CHI", "DET") 
nfc_north_plays <- plays |>
  filter(defensiveTeam %in% nfc_north_teams & !is.na(pff_passCoverage) & !is.na(yardsGained))

nfc_north_coverage <- nfc_north_plays |>
  group_by(defensiveTeam, pff_passCoverage) |>
  summarize(Snaps = n(), AverageYards = mean(yardsGained, na.rm = TRUE), .groups = "drop") |>
  filter(Snaps >= 30)

ggplot(nfc_north_coverage, aes(x = defensiveTeam, y = AverageYards, fill = pff_passCoverage)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = Snaps), 
    position = position_dodge(width = 0.9), 
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Average Yards Gained by Coverage Type for NFC North (Min 30 Snaps)",
    x = "Team",
    y = "Average Yards Gained",
    fill = "Coverage Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

There are a few interesting observations that come from this chart. First being that in the first half of 2022, Chicago and Green Bay played only three coverage for the majority of their snaps. Compare that to Minnesota who played 7 coverages over 30 times, with both cover 6 techniques leading to the worst results in the division on a per play basis.

Success Rate of the Denver Broncos by Quarter

league_plays <- plays |>
  filter(!is.na(yardsToGo) & !is.na(yardsGained))

league_plays <- league_plays |>
  mutate(
    success = case_when(
      down == 1 & yardsGained >= 0.4 * yardsToGo ~ 1,
      down == 2 & yardsGained >= 0.6 * yardsToGo ~ 1,
      (down == 3 | down == 4) & yardsGained >= yardsToGo ~ 1,
      TRUE ~ 0
    )
  )

broncos_success <- league_plays |>
  filter(possessionTeam == "DEN" & quarter %in% 1:4) |>
  group_by(quarter, down) |>
  summarize(SuccessRate = mean(success, na.rm = TRUE) * 100, .groups = "drop") |>
  mutate(Type = "Broncos")

league_success <- league_plays |>
  filter(quarter %in% 1:4) |>
  group_by(quarter, down) |>
  summarize(SuccessRate = mean(success, na.rm = TRUE) * 100, .groups = "drop") |>
  mutate(Type = "League Average")

success_comparison <- bind_rows(broncos_success, league_success)

ggplot(success_comparison, aes(x = factor(quarter), y = SuccessRate, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~down, labeller = labeller(down = c("1" = "1st Down", "2" = "2nd Down", "3" = "3rd Down", "4" = "4th Down"))) +
  labs(
    title = "Success Rate by Down and Quarter: Broncos vs League Average",
    x = "Quarter",
    y = "Success Rate (%)",
    fill = "Team"
  ) +
  scale_fill_manual(values = c("Broncos" = "#FB4F14", "League Average" = "gray")) +
  theme_minimal()

Here is another visualization illustrating my beloved Bronco’s anemic offense under Russell Wilson during the first half of 2022. Success rate is defined by pro-football-reference as “A play that gains at least 40% of yards required on 1st down, 60% of yards required on 2nd down, and 100% on 3rd or 4th down” We can see by this chart that the Broncos were below league average in every single instance in downs 1-3, and only higher in 4th down of the second quarter. This graph alone justifies eating the 50 million dollars in dead cap to get Russell Wilson off the team, no matter how successful he is operating the Steelers offense presently.

Time to Throw vs Yards Gained Interactive Graph

nfc_teams <- c("DAL", "PHI", "NYG", "WAS", "GB", "CHI", "MIN", "DET", 
               "TB", "NO", "CAR", "ATL", "SF", "SEA", "ARI", "LAR")

nfc_completed_passes <- plays |>
  filter(
    possessionTeam %in% nfc_teams & 
    passResult == "C" & 
    !is.na(timeToThrow) & 
    !is.na(yardsGained)
  ) |>
  mutate(
    playDescriptionShort = substr(playDescription, 1, 50)
  )

correlation <- cor(nfc_completed_passes$timeToThrow, nfc_completed_passes$yardsGained, use = "complete.obs")

scatter_plot <- 
  ggplot(nfc_completed_passes, aes(x = timeToThrow, y = yardsGained, color = possessionTeam)) +
  geom_point(aes(text = paste(
    "Team:", possessionTeam,
    "<br>Yards Gained:", yardsGained,
    "<br>Time to Throw:", timeToThrow,
    "<br>Play Description:", playDescriptionShort
  )), alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", color = "red", linetype = "dashed", size = 1) +
  annotate("text", x = max(nfc_completed_passes$timeToThrow) * 0.7, 
           y = max(nfc_completed_passes$yardsGained) * 0.9, 
           label = paste("Correlation: ", round(correlation, 2)), 
           size = 4, color = "black") +
  labs(
    title = "NFC Teams: Time to Throw vs. Yards Gained (Completed Passes)",
    x = "Time to Throw (Seconds)",
    y = "Yards Gained",
    color = "Team"
  ) +
  theme_minimal()

interactive_plot <- ggplotly(scatter_plot, tooltip = "text")
interactive_plot

Somewhat to my surprise, the correlation between time to throw and yards gained from completed passes is pretty low at 0.29, I imagine if I included the incomplete passes in the regression, the r value would approach 0. Perhaps this is due to the fact that, while QBs may have more time, the primary job of the play is to hit your first read and accomplish the task of the play design, as opposed to more time allowing for players to get down field more. I ran this without shortening the play description and my computer almost exploded, so the limit to 50 characters is due to that reason.

#Monte Carlo Methods of Inference

# Filter data for dropback plays with valid yards gained and play action labels
play_action_data <- plays |>
  filter(!is.na(playAction) & isDropback == TRUE & !is.na(yardsGained)) |>
  mutate(playAction = ifelse(playAction, "Play Action", "No Play Action"))

# Calculate observed test statistic (difference in medians)
observed_diff <- play_action_data |>
  group_by(playAction) |>
  summarize(median_yards = median(yardsGained)) |>
  summarize(diff = diff(median_yards)) |>
  pull(diff)

# Perform permutation test
set.seed(1999)
n_permutations <- 1000
null_distribution <- replicate(n_permutations, {
  shuffled_data <- play_action_data |>
    mutate(playAction = sample(playAction))
  shuffled_diff <- shuffled_data |>
    group_by(playAction) |>
    summarize(median_yards = median(yardsGained)) |>
    summarize(diff = diff(median_yards)) |>
    pull(diff)
  return(shuffled_diff)
})

# Calculate p-value
p_value <- mean(abs(null_distribution) >= abs(observed_diff))

# Summarize medians for each group
group_medians <- play_action_data |>
  group_by(playAction) |>
  summarize(median_yards = median(yardsGained), n = n())

# Extract medians for Play Action and No Play Action
play_action_median <- group_medians |>
  filter(playAction == "Play Action") |>
  pull(median_yards)
no_play_action_median <- group_medians |>
  filter(playAction == "No Play Action") |>
  pull(median_yards)

# Print results
cat("Play Action Median Yards Gained:", round(play_action_median, 2), "\n")
## Play Action Median Yards Gained: 4
cat("No Play Action Median Yards Gained:", round(no_play_action_median, 2), "\n")
## No Play Action Median Yards Gained: 4
cat("Observed Difference in Median Yards Gained:", round(observed_diff, 2), "\n")
## Observed Difference in Median Yards Gained: 0
# Visualize null distribution
library(ggplot2)
ggplot(data = data.frame(null_distribution), aes(x = null_distribution)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  geom_vline(xintercept = observed_diff, color = "red", linetype = "dashed", size = 1) +
  geom_vline(xintercept = quantile(null_distribution, probs = c(0.025, 0.975)), 
             color = "blue", linetype = "dotted", size = 1) +
  labs(
    title = "Null Distribution of Median Yards Gained Difference: Play Action vs No Play Action",
    x = "Difference in Median Yards Gained",
    y = "Frequency"
  ) +
  theme_minimal()

# Visualize observed distributions with density plots
ggplot(play_action_data, aes(x = yardsGained, fill = playAction)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("Play Action" = "blue", "No Play Action" = "red")) +
  geom_vline(aes(xintercept = median_yards), 
             data = group_medians, 
             color = c("blue", "red"), 
             linetype = "dashed", size = 1) +
  coord_cartesian(xlim = c(-10, 20)) +
  labs(
    title = "Distribution of Yards Gained: Play Action vs No Play Action",
    x = "Yards Gained",
    y = "Density",
    fill = "Play Type"
  ) +
  theme_minimal(base_size = 14)

# Interpret results
if (p_value < 0.05) {
  print(paste("The observed difference in median yards gained is statistically significant (p =", round(p_value, 3), ")."))
} else {
  print(paste("The observed difference in median yards gained is not statistically significant (p =", round(p_value, 3), ")."))
}
## [1] "The observed difference in median yards gained is not statistically significant (p = 1 )."

Conclusions / Main Takeaways

I found this exercise incredibly rewarding overall. I initially intended to have my data analyses follow a specific trend and theme, but I found that the 9 week sample was not enough to dive into any particular areas with any certainty. So instead I just went with my feel and curiosity in exploring different areas of the game that I found interesting. I think I created interesting visualizations that looked at all facets of the game of football, as well as diving into team and division analyses. Appreciate the great semester and I hope we get to connect in the future in other classes or research oppurtunities.